/*    Copyright 2010 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.subcommands;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import mosdi.fa.Alphabet;
import mosdi.fa.CDFA;
import mosdi.fa.DFAFactory;
import mosdi.util.ArrayTokenizer;
import mosdi.util.InvalidInputFileException;
import mosdi.util.Log;
import mosdi.util.NamedSequence;
import mosdi.util.SequenceUtils;

/**
 * Subcommand that reads a FASTA file and prints the pieces of each sequence
 * that remain after all occurrences of an IUPAC pattern have been removed.
 * Every remaining piece is printed as its own FASTA record named
 * {@code <original name>;part<k>}, with {@code k} counted from zero per input
 * sequence.
 */
public class CutOutMotifSubcommand extends Subcommand {

	@Override
	public String usage() {
		StringBuilder text = new StringBuilder();
		text.append(super.usage());
		text.append(" [options] <sequences.fasta> <iupac-pattern>\n");
		text.append("\n");
		text.append("Outputs a FASTA file where pattern instances have been cut out.\n");
		text.append("All characters not in {A,C,G,T} are also cut out.\n");
		text.append("\n");
		text.append("Options:\n");
		text.append("  -r: also cut out matches of reverse complementary motif\n");
		text.append("  -m <minlength>: discard sequences shorter than this length (default:1)");
		return text.toString();
	}

	@Override
	public String description() {
		return "Removes a given motif from given sequences.";
	}

	@Override
	public String name() {
		return "cut-out-motif";
	}

	/**
	 * Parses the command line, loads the sequences, and writes every
	 * motif-free piece of at least the requested minimum length to the log
	 * at STANDARD level.
	 *
	 * @param args command line arguments of this subcommand
	 * @return 0 after all sequences have been processed
	 */
	@Override
	public int run(String[] args) {
		parseOptions(args, 2, "rm:");

		// Positional arguments: input FASTA file and the motif to remove.
		String filename = getStringArgument(0);
		String pattern = getStringArgument(1);

		// Optional switches (no dependencies between them).
		boolean considerReverse = getBooleanOption("r", false);
		int minLength = getPositiveIntOption("m", 1);

		Alphabet alphabet = Alphabet.getDnaAlphabet();
		List<NamedSequence> sequences = loadSequences(filename, alphabet);

		// Automaton recognizing the motif.
		// NOTE(review): the meaning of the literal 10000 is not visible here -- confirm against DFAFactory.
		CDFA automaton = DFAFactory.buildFromIupacPattern(pattern, considerReverse, 10000);

		for (NamedSequence record : sequences) {
			// Part numbers restart at zero for every input sequence.
			int partCounter = 0;
			// Split at every -1 entry; presumably these mark characters outside
			// the alphabet (cf. usage text) -- TODO confirm against readFastaFile.
			ArrayTokenizer chunks = new ArrayTokenizer(record.getSequence(), -1);
			while (chunks.hasNext()) {
				int[] chunk = chunks.next();
				int keepFrom = 0;
				int state = automaton.getStartState();
				int pos = 0;
				while (pos < chunk.length) {
					state = automaton.getTransitionTarget(state, chunk[pos]);
					if (automaton.getStateOutput(state) > 0) {
						// A match ends at pos; the piece to keep stops right
						// before the match starts (exclusive bound).
						int keepTo = pos - pattern.length() + 1;
						if (keepTo - keepFrom >= minLength) {
							printPart(record, partCounter, alphabet, chunk, keepFrom, keepTo);
							partCounter += 1;
						}
						keepFrom = pos + 1;
					}
					pos += 1;
				}
				// Remainder after the last match (or the whole chunk if none matched).
				if (chunk.length - keepFrom >= minLength) {
					printPart(record, partCounter, alphabet, chunk, keepFrom, chunk.length);
					partCounter += 1;
				}
			}
		}
		return 0;
	}

	/**
	 * Reads the FASTA file; on any read or format error an error message is
	 * logged and the process is terminated with exit status 1.
	 */
	private static List<NamedSequence> loadSequences(String filename, Alphabet alphabet) {
		List<NamedSequence> loaded = null;
		try {
			loaded = SequenceUtils.readFastaFile(filename, alphabet, true);
		} catch (FileNotFoundException e) {
			Log.errorln("Input file not found.");
			System.exit(1);
		} catch (IOException e) {
			Log.errorln("Error reading input file: "+e.getMessage());
			System.exit(1);
		} catch (InvalidInputFileException e) {
			Log.errorln("Invalid FASTA file: "+e.getMessage());
			System.exit(1);
		}
		return loaded;
	}

	/** Prints symbols[from..to) as one FASTA record named after its source and part number. */
	private static void printPart(NamedSequence source, int partNumber, Alphabet alphabet, int[] symbols, int from, int to) {
		int[] part = Arrays.copyOfRange(symbols, from, to);
		Log.printf(Log.Level.STANDARD, ">%s;part%d\n%s\n", source.getName(), partNumber, alphabet.buildString(part));
	}
}
